1 Revision History

This version

Current version: 1
Date report generated: 26 October, 2020
Report prepared for: Daniel Desaulniers, Cathy Cummings-Lorbetskie
Purpose of report:

  • Present exploratory results from RNASeq experiments

Previous revisions

N/A


2 Steps in Data Analysis

This report is meant to help explore DESeq2 results and was generated using RMarkdown. This section contains the code for setting up the rest of the report.

2.1 Load R libraries

## knitrBoostrap and device chunk options
library('knitr')
# Set so that long lines in R will be wrapped:
opts_chunk$set(bootstrap.show.code = FALSE, bootstrap.panel = TRUE, cache = TRUE) # tidy.opts=list(width.cutoff=60), tidy=TRUE, crop = NULL
# Set this variable to FALSE if you only want to run the plotting functions by reloading the last RData file.
flag=TRUE # To eveluate analysis chunks
if (flag==FALSE) { load("Partial_analysis.RData") }
# Set this variable to TRUE if you would like to embed the files directly into the HTML for portability. This slows down page responsiveness drastically, since the files are generally quite large.
embedFiles=FALSE
# Se this variable to be TRUE if you want to have separate plots of top genes as defined in the R-ODAF template
R_ODAF_plots=FALSE 
message("If this text is visible by default, this report was produced to test plotting functions and should be used exclusively for testing and development.")
#### Load Libraries

## Record start time
startTime <- Sys.time()

## Set libPaths
.libPaths(c("/home/mpiage/R/x86_64-pc-linux-gnu-library/3.6",.libPaths()))

## Bioconductor
library('DESeq2')
library('edgeR')
#library('org.Ca.eg.db') # MOUSE: library('org.Mm.eg.db') # HUMAN: library('org.Hs.eg.db') # RAT: library('org.Rn.eg.db') # HAMSTER: library('org.Ca.eg.db')
library('AnnotationHub')
OrgDb.Ma <- query(AnnotationHub(), c("OrgDb","Mesocricetus auratus"))[[1]] # Golden Hamster
library('enrichplot')
library('rWikiPathways')
library('BiocParallel')

## CRAN
library('ggplot2')                                 
theme_set(theme_bw())
library('knitr')
library('RColorBrewer')
library('viridis')
library('pheatmap')
library('DT')
library('sessioninfo')
library('plotly')

## Other packages
packages <- c("RMariaDB",
              "clusterProfiler",
              "biomaRt",
              "regionReport",
              "magrittr",
              "vsn",
              "lattice",
              "pheatmap",
              "dplyr",
              "data.table",
              "forcats",
              "tidytext",
              "openxlsx",
              "kableExtra"
              )

invisible(suppressPackageStartupMessages(lapply(packages, function(x)require(x, character.only = T, quietly = T))))
options(java.parameters = "-Xmx10000m")

The code above (not shown by default) loads the relevant R packages required for analysis.

2.2 Specify parameters

###################################################################################
###################################################################################
# PARAMETERS TO SET MANUALLY                                     

# Set file locations                
    sampledir <- "/home/mpiage/data/2020_hamster_RNAseq_Desaulniers/"
  setwd(sampledir)
    outputdir <- paste(sampledir, "./DEG_output/", sep="")
    if(!dir.exists(outputdir)) {dir.create(outputdir)}
# Names of files to load
    SampleDataFile <- "genes.data.tsv" #This tab delimited file contains the merged RSEM.genes.results files
    SampleKeyFile <- "metadata.txt" #This tab delimited file contains at least 2 columns: NAME (sample names identical to the column names of sampleData) and Compound (needs to identify to which group the sample belongs -> ExperimentalGroup & ControlGroup)
    ContrastsFile <- "contrasts.txt" #This tab delimited file contains one row per contrast, with the control for comparison in the right column and the comparison in the left column
# Specify which groups need to be compared 
    contrasts <- read.delim(ContrastsFile, stringsAsFactors=FALSE, sep="\t", header=FALSE,  quote="\"")
    short_contrast_names <- paste(contrasts$V1,"v.",contrasts$V2) # Customize these for your experiment... Must be short enough to fit as Excel tab titles.
    intgroup <- DESIGN <- "group"   #Column name which defines the groups to be compared
  nBestFeatures <- 20 # The number of best features to make plots of their counts
  nBest <- 100 # Number of features to include in table and limiting PCA/clustering analysis
  nHeatmap <- 50 # Number of most variable genes for heatmap
  nHeatmapDEGs <- 50 # Number of DEGs for heatmap
# Set analysis ID. This ID will be used as prefix for the output files
    analysisID <-"2020_"
# Specify used platform/technology for data generation:
    Platform <- "RNA-Seq" # Specify "RNA-Seq" or "TempO-seq"

# Misc parameters
    digits = 2 # For rounding numbers

The code above (not shown by default) specifies user preferences and data locations.

The experimental comparisons of interest are as follows: 1 5adC v. 0 Naive, 5 5adC v. 0 Naive, 1 5adC v. 0 Vehicle, 5 5adC v. 0 Vehicle, 0 Vehicle v. 0 Naive.

2.3 Load data

# Load input files 
setwd(sampledir)
sampleData <- read.delim(SampleDataFile,
                         sep="\t",
                         stringsAsFactors=FALSE,
                         header=TRUE, 
                         quote="\"",
                         row.names=1,
                         check.names=FALSE)
DESeqDesign <- read.delim(SampleKeyFile,
                          stringsAsFactors=FALSE,
                          sep="\t",
                          header=TRUE,
                          quote="\"",
                          row.names="name") # Pick column that is used in ID

NORM_TYPE<-paste0(analysisID, "_DESeq2_", Platform)

plotdir <- paste(outputdir, "/plots/", sep="")
if(!dir.exists(plotdir)) {dir.create(plotdir)}
barplot.dir<- paste(plotdir, "/barplot_genes/", sep="")
if(!dir.exists(barplot.dir)) {dir.create(barplot.dir)}

The code above (not shown by default) loads user-provided meta data (i.e., information about your experiment, also known as colData, or, column data). This also imports the countx matrix (i.e., a table of observed counts in which each sample is a column and genes are rows).

2.4 Run DESeq2

##########
# DESeq2 #
##########
print(NORM_TYPE) # Name of experiment
## [1] "2020__DESeq2_RNA-Seq"

# First data clean-up: replace NA & remove samples with total readcount < threshold
threshold = 1000000
initialSampleDataCount <- ncol(sampleData)
sampleData[ is.na(sampleData) ] <- 0 
sampleData <- sampleData[,(colSums(sampleData) > threshold)] # 1 million reads required per sample
filteredSampleDataCount <- ncol(sampleData)
# Sometimes extra cleanup may be needed
# colnames(sampleData) <- gsub(pattern="^0", replacement="", x=colnames(sampleData))

samples_before <- nrow(DESeqDesign)
DESeqDesign$original_names <- rownames(DESeqDesign)
metadata_in_sampledata <- all(rownames(DESeqDesign) %in% colnames(sampleData)) # Sanity check: each sample name (row) in the metadata should have a corresponding column in the count data
sampledata_in_metadata <- all(colnames(sampleData) %in% rownames(DESeqDesign)) # Sanity check: each sample name (row) in the metadata should have a corresponding column in the count data
removed <- colnames(sampleData[which(!colnames(sampleData) %in% rownames(DESeqDesign))])
DESeqDesign <- DESeqDesign[(colnames(sampleData)),] # Reorder the metadata table to correspond to the order of columns in the count data
DESeqDesign <- na.omit(DESeqDesign)
DESeqDesign <- DESeqDesign[ DESeqDesign$treatment != "human reference RNA",] # Remove samples manually
sampleData <- sampleData[,(rownames(DESeqDesign))]
samples_after <- nrow(DESeqDesign)


# For TempO-Seq: Limit sampleData matrix to genes in the assay.
if (Platform=="TempO-Seq") {
biospyder <- read.delim("~/dbs/biospyder/191113_Human_S1500_Surrogate_2.0_Manifest.csv", # Assay manifest...
                          stringsAsFactors=FALSE,
                          sep="\t",
                          header=TRUE,
                          quote="\"")
sampleData <- sampleData[row.names(sampleData) %in% biospyder$ENSEMBL_GENE_ID,]
}

head(rownames(DESeqDesign))
## [1] "1_um_5adC_rep_1" "1_um_5adC_rep_2" "1_um_5adC_rep_3" "1_um_5adC_rep_4"
## [5] "5_um_5adC_rep_1" "5_um_5adC_rep_2"
head(colnames(sampleData)) # Output should match
## [1] "1_um_5adC_rep_1" "1_um_5adC_rep_2" "1_um_5adC_rep_3" "1_um_5adC_rep_4"
## [5] "5_um_5adC_rep_1" "5_um_5adC_rep_2"

setwd(outputdir)
if (file.exists("dds.Rdata")) {
  print(paste("Already found DESeq2 object from previous run; loading from disk."))
  load("./dds.Rdata")
  if (!identical(as.data.frame(round(counts(dds))),
                 round(sampleData),0)) {
    print("Not identical")
    }
  } else {
  dds <- DESeqDataSetFromMatrix(countData = round(sampleData),
                                colData = as.data.frame(DESeqDesign),
                                design = ~group )
  dds <- dds[, rownames(DESeqDesign)]
  dds <- dds[rowSums(counts(dds)) > 1]
  dds <- DESeq(dds, parallel = TRUE, BPPARAM=MulticoreParam(39))
  save(dds, file="./dds.Rdata")
  }
## [1] "Already found DESeq2 object from previous run; loading from disk."
## [1] "Not identical"

# ### REMOVE if done above from scratch
# keep <- grep(colData(dds)$chemical, pattern="cells", invert=T, ignore.case = T)
# dds <- dds[,keep]
# dds <- dds[row.names(dds) %in% biospyder$ENSEMBL_GENE_ID,]

# Another sanity check to make sure the object looks correct
resultsNames(dds)
## [1] "Intercept"                  "group_0.Vehicle_vs_0.Naive"
## [3] "group_1.5adC_vs_0.Naive"    "group_5.5adC_vs_0.Naive"
head(colData(dds))
## DataFrame with 6 rows and 6 columns
##                      dose   treatment   replicate    group  original_names
##                 <integer> <character> <character> <factor>     <character>
## 1_um_5adC_rep_1         1        5adC        rep1   1 5adC 1_um_5adC_rep_1
## 1_um_5adC_rep_2         1        5adC        rep2   1 5adC 1_um_5adC_rep_2
## 1_um_5adC_rep_3         1        5adC        rep3   1 5adC 1_um_5adC_rep_3
## 1_um_5adC_rep_4         1        5adC        rep4   1 5adC 1_um_5adC_rep_4
## 5_um_5adC_rep_1         5        5adC        rep1   5 5adC 5_um_5adC_rep_1
## 5_um_5adC_rep_2         5        5adC        rep2   5 5adC 5_um_5adC_rep_2
##                        sizeFactor
##                         <numeric>
## 1_um_5adC_rep_1 0.851980611293044
## 1_um_5adC_rep_2 0.848279595118718
## 1_um_5adC_rep_3 0.900285964389282
## 1_um_5adC_rep_4  1.07172382023083
## 5_um_5adC_rep_1  1.38338182362141
## 5_um_5adC_rep_2  1.10690151339656
head(assay(dds))
##                    1_um_5adC_rep_1 1_um_5adC_rep_2 1_um_5adC_rep_3
## ENSMAUG00000000002            1001            1337            1336
## ENSMAUG00000000004            5758            7318            7258
## ENSMAUG00000000005               5               2               2
## ENSMAUG00000000006           14640           17361           17928
## ENSMAUG00000000008               0               0               0
## ENSMAUG00000000010           21753           26360           28254
##                    1_um_5adC_rep_4 5_um_5adC_rep_1 5_um_5adC_rep_2
## ENSMAUG00000000002            1612            2412            1762
## ENSMAUG00000000004            8802           12752           10116
## ENSMAUG00000000005               2               0               2
## ENSMAUG00000000006           21259           27569           25418
## ENSMAUG00000000008               0               0               0
## ENSMAUG00000000010           32936           42322           38865
##                    5_um_5adC_rep_3 5_um_5adC_rep_4 Naive_rep1 Naive_rep2
## ENSMAUG00000000002            1540            1465       1368       1490
## ENSMAUG00000000004            8431            7952       7194       7747
## ENSMAUG00000000005               0               3          0          1
## ENSMAUG00000000006           19777           19002      18877      18467
## ENSMAUG00000000008               0               0          1          0
## ENSMAUG00000000010           31088           28897      28091      29058
##                    Naive_rep3 Naive_rep4 Vehicle_rep1 Vehicle_rep2 Vehicle_rep3
## ENSMAUG00000000002       1239       1035         1998         1970         9585
## ENSMAUG00000000004       5852       5643        11140        10190        24510
## ENSMAUG00000000005          0          0            2            1            1
## ENSMAUG00000000006      15495      15016        27658        22377        28300
## ENSMAUG00000000008          1          0            0            0            0
## ENSMAUG00000000010      23597      24018        41923        34782        45531
##                    Vehicle_rep4
## ENSMAUG00000000002         2263
## ENSMAUG00000000004        13687
## ENSMAUG00000000005            1
## ENSMAUG00000000006        31726
## ENSMAUG00000000008            0
## ENSMAUG00000000010        48759
head(rowRanges(dds))
## GRangesList object of length 6:
## $ENSMAUG00000000002 
## GRanges object with 0 ranges and 0 metadata columns:
##    seqnames    ranges strand
##       <Rle> <IRanges>  <Rle>
## 
## $ENSMAUG00000000004 
## GRanges object with 0 ranges and 0 metadata columns:
##      seqnames ranges strand
## 
## $ENSMAUG00000000005 
## GRanges object with 0 ranges and 0 metadata columns:
##      seqnames ranges strand
## 
## ...
## <3 more elements>
## -------
## seqinfo: no sequences
str(counts(dds))
##  int [1:15971, 1:16] 1001 5758 5 14640 0 21753 0 0 0 93694 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:15971] "ENSMAUG00000000002" "ENSMAUG00000000004" "ENSMAUG00000000005" "ENSMAUG00000000006" ...
##   ..$ : chr [1:16] "1_um_5adC_rep_1" "1_um_5adC_rep_2" "1_um_5adC_rep_3" "1_um_5adC_rep_4" ...

#Set parameters according to platform
if (Platform=="RNA-Seq"){
  MinCount<- 1
  alpha <- pAdjValue <- 0.05 # Relaxed from 0.01
  linear_fc_filter = 1.5
} else if (Platform=="TempO-Seq") {
  MinCount<- 0.5
  alpha <- pAdjValue<- 0.05 
  linear_fc_filter = 1.5
} else { print("Platform/technology not recognized") }


# Make regularized log object for later plotting
#rld <- tryCatch(rlog(dds), error = function(e) { rlog(dds, fitType = 'mean') })
# Use vst for hundreds of samples!
rld <- vst(dds) # Should this be blind?
 

The code above (not shown by default) uses DESeq2 1.24.0 to test for differentially abundant genes within the RNA-Seq data.

Prior to running DESeq2, the data was filtered to remove samples that do not have 10^{6} reads per sample.

The user-provided metadata initially included 18 samples.

The count matrix initially included 19 samples (including any reference material samples). After removing samples with less than 10^{6} reads, 19 samples were left. It is TRUE that all the samples provided in the metadata table were also identified in the count matrix. It is FALSE that all the samples in the count matrix were also identified in the metadata table.

Following the removal of other samples (i.e., reference RNA, etc.), there were 16 samples remaining in the experiment. The samples excluded from analysis were named: Undetermined.

2.5 Sample data (metadata about your experiment)

This table shows the final list of samples that were used in the data analysis (as well as corresponding sample information as it was communicated to the Genomics Laboratory).

knitr::kable(DESeqDesign,
             row.names =  F,
             caption="User-provided information about samples and experimental conditions") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))  %>%
  scroll_box(width = "720px")
User-provided information about samples and experimental conditions
dose treatment replicate group original_names
1 5adC rep1 1 5adC 1_um_5adC_rep_1
1 5adC rep2 1 5adC 1_um_5adC_rep_2
1 5adC rep3 1 5adC 1_um_5adC_rep_3
1 5adC rep4 1 5adC 1_um_5adC_rep_4
5 5adC rep1 5 5adC 5_um_5adC_rep_1
5 5adC rep2 5 5adC 5_um_5adC_rep_2
5 5adC rep2 5 5adC 5_um_5adC_rep_3
5 5adC rep4 5 5adC 5_um_5adC_rep_4
0 Naive rep1 0 Naive Naive_rep1
0 Naive rep2 0 Naive Naive_rep2
0 Naive rep3 0 Naive Naive_rep3
0 Naive rep4 0 Naive Naive_rep4
0 Vehicle rep1 0 Vehicle Vehicle_rep1
0 Vehicle rep2 0 Vehicle Vehicle_rep2
0 Vehicle rep3 0 Vehicle Vehicle_rep3
0 Vehicle rep4 0 Vehicle Vehicle_rep4

2.6 Load R-ODAF functions

###################################################################################
#DEFINE FUNCTIONS
###################################################################################
plot.barplots<-function(samples,b) {
  color <- NULL
  for (h in 1:ncol(norm_data)){
    if (substring(colnames(norm_data)[h], 1, 3) == substring(condition2, 1, 3)) { color <- c(color, "red3") } else { color <- c(color, "darkgrey")}
  }
  fileNamePlot <- paste0(b, row.names(samples)[b], ".png")
  pseudoTitle <- paste0(row.names(samples)[b], "_pAdj:", samples[b,"padj"])
  
  png(file=paste(fileNamePlot, sep="/"), width=1200, height=700, pointsize=20)
  par(mar=c(8,4,3,1))
  barplot(as.numeric(norm_data[row.names(samples)[b],]), las=2, col=color, main=pseudoTitle, cex.names=0.5,  cex.axis=0.8, names.arg=colnames(norm_data)) 
  dev.off()
} #plot.barplots function done

###################################################################################
draw.barplots<-function(samples, top_bottom, NUM){
  if (nrow(samples) == 0) {
    #print("no genes to plot") 
  } else { 
    if (top_bottom == "top") {
      #print(paste0("drawing Top ", NUM, " plots"))
      if (nrow(samples) <= NUM) { 
        for (b in 1:nrow(samples)) {plot.barplots(samples,b)}
      }
      
      if (nrow(samples) > NUM) { 
        for (b in 1:NUM) {plot.barplots(samples,b)} 
      } 
    }
    
    if (top_bottom == "bottom") {
      #print(paste0("drawing Bottom", NUM, " plots"))
      if (nrow(samples) <= NUM) { 
        for (b in 1:nrow(samples)) {plot.barplots(samples,b)}
      }
      if (nrow(samples) > NUM) { 
        for (b in ((nrow(samples)-NUM+1):nrow(samples))) {plot.barplots(DEsamples,b)}
      }
    }}
} #draw.barplots function done

###################################################################################
###################################################################################

The code above (not shown by default) loads in plotting functions specific to the Omics Data Analysis Frameworks for Regulatory application (R-ODAF) template. More information on the R-ODAF framework is available here.

2.7 Run pairwise comparisons

cooks <- FALSE # Normally set Cook's cutoff to false
resList <- list()
for (x in 1:nrow(contrasts)) {  ## for all comparisons to be done   
    
  condition1 <- contrasts[x,2]
    condition2 <- contrasts[x,1]
    print(paste(condition2, " vs ", condition1, ":", NORM_TYPE))
    
    DE_Design <- matrix(data=NA, ncol=2)
    DE_Design <- DESeqDesign[c(grep(condition1,DESeqDesign[,DESIGN]), grep(condition2,DESeqDesign[,DESIGN])),]
    samples <- sampleData[, rownames(DE_Design) ]
    colnames(samples) <- NULL
    
    ###########

    print(paste0("Filtering genes: 75% of at least 1 group need to be above ", MinCount, " CPM"))
    print("AND")
    print("Detecting spurious spikes: Max-Median > Sum/(Rep+1)" )
        SampPerGroup<-table(DE_Design[,DESIGN])
        idx<-FlagSpike<-NameRows<-NULL
        Counts<-counts(dds, normalized=TRUE)
        CPMdds<-cpm(counts(dds, normalized=TRUE))
        for (gene in 1:nrow(dds)) {
            GroupsPass <- checkSpike<-NULL
            for (group in 1:length(SampPerGroup)) { #test if group passes
                sampleCols<-grep(dimnames(SampPerGroup)[[1]][group],DE_Design[,DESIGN])
                Check<-sum(CPMdds[gene,sampleCols] >= MinCount)>= 0.75*SampPerGroup[group]
                GroupsPass<-c(GroupsPass, Check)
                if (Check == FALSE) {checkSpike<- c(checkSpike, Check)} else {
                    checkSpike<-c(checkSpike, ((max(Counts[gene,sampleCols])-median(Counts[gene,sampleCols])) >=
                                                 (sum(Counts[gene,sampleCols])/(SampPerGroup[group]+1))))
                }
            }
            idx <- c(idx, as.logical(sum(GroupsPass)))
            if (sum(checkSpike) >=1) {
                FlagSpike<-rbind(FlagSpike, Counts[gene,])
                NameRows<<-c(NameRows, row.names(Counts)[gene])
                row.names(FlagSpike)<-NameRows 
            }
        }

    print("Obtaining the DESeq2 results")
    currentContrast <- c(DESIGN, condition2, condition1)
    res <- results(dds[idx],
                   parallel = TRUE, BPPARAM=MulticoreParam(39),
                   contrast=currentContrast,
                   pAdjustMethod= 'fdr',
                   cooksCutoff=cooks) # Cooks cutoff disabled - manually inspect.
    res <- lfcShrink(dds=dds[idx],
                     contrast=currentContrast,
                     res=res,
                     type="ashr")
    #Make new directory for the ODAF-specific files
    ODAFdir <- paste(outputdir, "/R-ODAF/", sep="")
    if(!dir.exists(ODAFdir)) {dir.create(ODAFdir)}
    setwd(ODAFdir)
    FileName <- paste(NORM_TYPE, condition2,"vs",condition1, "FDR", pAdjValue, sep="_")     
    #Save output tables     
    norm_data <<- counts(dds[idx],normalized=TRUE)
    colnames(norm_data) <- colData(dds)[,DESIGN]
    write.table(norm_data,file=paste0(FileName, "_Norm_Data.txt"), sep="\t", quote=FALSE)
    write.table(FlagSpike,file=paste0(FileName, "_FlaggedSpikes.txt"), sep="\t", quote=FALSE)
    DEsamples <<- subset(res,res$padj < pAdjValue)  
    write.table(DEsamples,file=paste0(FileName,"_DEG_table.txt"), sep="\t", quote=FALSE)
    DEspikes <<- DEsamples[rownames(DEsamples)%in%NameRows,]    
    write.table(DEspikes,file=paste0(FileName,"_DEspikes_table.txt"), sep="\t",quote=FALSE)

    resList[[x]] <- res
    
    if (R_ODAF_plots==TRUE) {
    print("creating Read count Plots")
    # top DEGs
    plotdir<- paste(outputdir, "/plots/", sep="")
    if(!dir.exists(plotdir)) {dir.create(plotdir)}
    barplot.dir<- paste(plotdir, "/barplot_genes/", sep="")
    if(!dir.exists(barplot.dir)) {dir.create(barplot.dir)}
  
    TOPbarplot.dir<- paste(barplot.dir, "Top_DEGs/", sep="")
    if(!dir.exists(TOPbarplot.dir)) {dir.create(TOPbarplot.dir)}
    setwd(TOPbarplot.dir)
    draw.barplots(DEsamples, "top", 20) #(DEsamples, top_bottom, NUM)
    print("Top 20 DEG plots done")
  
    # Spurious spikes
    SPIKEbarplot.dir<- paste(barplot.dir, "DE_Spurious_spikes/", sep="")
    if(!dir.exists(SPIKEbarplot.dir)) {dir.create(SPIKEbarplot.dir)}
    setwd(SPIKEbarplot.dir)
    draw.barplots(DEspikes, "top", nrow(DEspikes)) #(DEsamples, top_bottom, NUM)
    print("All DE_Spurious_spike plots done")
    }
}
## [1] "1 5adC  vs  0 Naive : 2020__DESeq2_RNA-Seq"
## [1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
## [1] "AND"
## [1] "Detecting spurious spikes: Max-Median > Sum/(Rep+1)"
## [1] "Obtaining the DESeq2 results"
## [1] "5 5adC  vs  0 Naive : 2020__DESeq2_RNA-Seq"
## [1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
## [1] "AND"
## [1] "Detecting spurious spikes: Max-Median > Sum/(Rep+1)"
## [1] "Obtaining the DESeq2 results"
## [1] "1 5adC  vs  0 Vehicle : 2020__DESeq2_RNA-Seq"
## [1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
## [1] "AND"
## [1] "Detecting spurious spikes: Max-Median > Sum/(Rep+1)"
## [1] "Obtaining the DESeq2 results"
## [1] "5 5adC  vs  0 Vehicle : 2020__DESeq2_RNA-Seq"
## [1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
## [1] "AND"
## [1] "Detecting spurious spikes: Max-Median > Sum/(Rep+1)"
## [1] "Obtaining the DESeq2 results"
## [1] "0 Vehicle  vs  0 Naive : 2020__DESeq2_RNA-Seq"
## [1] "Filtering genes: 75% of at least 1 group need to be above 1 CPM"
## [1] "AND"
## [1] "Detecting spurious spikes: Max-Median > Sum/(Rep+1)"
## [1] "Obtaining the DESeq2 results"

The code above (not shown by default) runs the R-ODAF spurious spike detection and outputs the DESeq Results objects as a list for each contrast. As specified by the R-ODAF guidelines, 75% of at least 1 group need to be above 1 CPM and spurious spikes were removed in which Max-Median > Sum/(Rep+1).

The log2FoldChange shrinkage procedure used was ashr. An alpha of 0.1 was used to extract raw results, which are reported as the Wald test p-value. To account for multiple testing, fdr adjusted p-values are reported. Cook’s cutoff was set to FALSE in this analysis.

2.8 Create summary tables

#listEnsembl()
#listMarts()
#listDatasets(useMart('ensembl'))
ensembl <- useMart("ensembl", dataset = "mauratus_gene_ensembl",  host = "useast.ensembl.org") #  Mouse: "mmusculus_gene_ensembl" # Human: "hsapiens_gene_ensembl" # Golden hamster: mauratus_gene_ensembl # Rat: rnorvegicus_gene_ensembl
#listAttributes(ensembl)
#listFilters(ensembl)

genes <- row.names(resList[[1]])

#genes_all <- unique(row.names(assay(dds)))
setwd(outputdir)
if (file.exists("id_table.Rdata")) {
  print(paste("Already found ID table; skipping biomaRt query and loading from disk."))
  load("./id_table.Rdata")
  id_table <- id_table_entrez[,1:3]
  } else {
  id_table_entrez <- getBM(filters="ensembl_gene_id",
                        attributes= c("ensembl_gene_id",
                                      "external_gene_name", # mgi_symbol for Mouse
                                      "description",
                                      "entrezgene_id"), # "refseq_mrna" or "refseq_peptide" may be of interest, but can't join with gene tables.
                        values=genes,
                        mart=ensembl)
  save(id_table_entrez, file="./id_table.Rdata")
  id_table <- id_table_entrez[,1:3]
}
## [1] "Already found ID table; skipping biomaRt query and loading from disk."

genes_entrezid <- dplyr::left_join(data.frame(ensembl_gene_id=genes), id_table_entrez, by="ensembl_gene_id")

sigtabList <- list()
alltablList <- list()
for (i in 1:length(resList)) {
  print(i)
  sigTab <- resList[[i]]
  # Add taxonomy
  if (nrow(sigTab) == 0) {
    next
  } else {
    sigTab <- cbind(ensembl_gene_id=row.names(resList[[i]]),
                    as(sigTab, "data.frame"),
                    contrast = gsub(pattern = paste0("log2.*", DESIGN, "\ "),
                                    replacement =  "",
                                    x = resList[[i]]@elementMetadata[[2]][2]))
    sigTab <- dplyr::left_join(sigTab,
                               id_table,
                               by="ensembl_gene_id")
    sigTab <- dplyr::mutate(sigTab, linearFoldChange=ifelse(log2FoldChange > 0,
                                                            2 ^ log2FoldChange,
                                                            -1 / (2 ^ log2FoldChange)))
    #sigTab <- sigTab[,c(1,9,10,2,3,11,5:8)] # Reorder columns
    sigTab <- sigTab[,c(1,8,9,2,3,10,4:7)] 
    alltablList[[i]] <- sigTab
    sigTab <- sigTab[!is.na(sigTab$padj) & sigTab$padj < alpha & abs(sigTab$linearFoldChange) > linear_fc_filter, ] ## FILTERS!
    sigtabList[[i]] <- sigTab %>% dplyr::distinct()
  }
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
# Write dataframe of all results
sigtabList <- sigtabList[!sapply(sigtabList, is.null)]
significantResults <- do.call(rbind, sigtabList)
significantResults <- dplyr::distinct(significantResults)
allResults <- do.call(rbind, alltablList)

# All Results for Plotting
# Replace NA gene symbols with ensembl ID
allResults$external_gene_name[is.na(allResults$external_gene_name)] <- allResults$ensembl_gene_id[is.na(allResults$external_gene_name)] 
# Replace blank gene symbols with ensembl ID
allResults$external_gene_name[allResults$external_gene_name == ""]  <- allResults$ensembl_gene_id[allResults$external_gene_name == ""]  
allResults$padj[allResults$padj == 0] <- 10^-100 # For plotting purposes!
allResultsOrdered_logFC_filter <- dplyr::filter(allResults, abs(linearFoldChange) > 1.5) %>%
  arrange(-abs(linearFoldChange))
allResultsOrdered_logFC_filter <- dplyr::filter(allResultsOrdered_logFC_filter, padj < alpha)
res.df <- allResultsOrdered_logFC_filter

degTable <- significantResults %>% 
  dplyr::group_by(contrast) %>%
  dplyr::count()

lengths <- lapply(resList, nrow)
longest <- which.max(lengths)
summaryTable <- data.frame( ensembl_gene_id=row.names(resList[[longest]]),
                            baseMean=resList[[longest]]$baseMean )
contrastsInSummary <- vector()
for (i in 1:length(resList)) {
  print(i)
  n <- resList[[i]]@elementMetadata[[2]][2]
  n <- gsub(pattern = paste0("log2\ fold\ change\ \\(MMSE\\):\ ",DESIGN), # Might need to be MLE for some tests
            replacement =  paste0("log2 Fold Change"),
            x = resList[[i]]@elementMetadata[[2]][2])
  p <- gsub(pattern = paste0("log2\ fold\ change\ \\(MMSE\\):\ ",DESIGN),
           replacement =  resList[[i]]@elementMetadata[[2]][6], # Not used?
           x = resList[[i]]@elementMetadata[[2]][2])
  q <- gsub(pattern = paste0("log2\ fold\ change\ \\(MMSE\\):\ ",DESIGN,"\ "),
            replacement =  "",
            x = resList[[i]]@elementMetadata[[2]][2])
  message(n)
  message(p)
  message(q)
  toJoin <- as.data.frame(resList[[i]])
  setDT(toJoin, keep.rownames = T)[]
  setnames(toJoin, 1, "ensembl_gene_id")
  toJoin <- mutate(toJoin, linearFoldChange=ifelse(log2FoldChange > 0,
                                                2 ^ log2FoldChange,
                                                -1 / (2 ^ log2FoldChange)))
  toJoin <- toJoin[,c(1:3,7,4:6)]
  summaryTable <- dplyr::left_join(summaryTable, dplyr::select(toJoin, !c(baseMean,pvalue,lfcSE)), by="ensembl_gene_id")

  names(summaryTable)[[ncol(summaryTable)-2]] <- paste0("log2FoldChange_",i)
  names(summaryTable)[[ncol(summaryTable)-1]] <- paste0("linearFoldChange_",i)
  names(summaryTable)[[ncol(summaryTable)]] <- paste0("FDR_",i)
  contrastsInSummary[i] <- q
  print(summary(resList[[i]], pAdjValue))
}
## [1] 1
## 
## out of 11554 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1776, 15%
## LFC < 0 (down)     : 1600, 14%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## NULL
## [1] 2
## 
## out of 11554 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 2585, 22%
## LFC < 0 (down)     : 2467, 21%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## NULL
## [1] 3
## 
## out of 11554 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 3516, 30%
## LFC < 0 (down)     : 3378, 29%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## NULL
## [1] 4
## 
## out of 11554 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 3797, 33%
## LFC < 0 (down)     : 3591, 31%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## NULL
## [1] 5
## 
## out of 11554 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 2368, 20%
## LFC < 0 (down)     : 2328, 20%
## outliers [1]       : 0, 0%
## low counts [2]     : 224, 1.9%
## (mean count < 13)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## 
## NULL
maxFCs <- allResults %>%
  dplyr::group_by(ensembl_gene_id) %>%
  dplyr::filter(abs(linearFoldChange) == max(abs(linearFoldChange))) %>%
  dplyr::ungroup() %>%
  dplyr::select(ensembl_gene_id, linearFoldChange)

minPvals <- allResults %>%
  group_by(ensembl_gene_id) %>%
  dplyr::filter(padj == min(padj)) %>%
  dplyr::ungroup() %>%
  dplyr::select(ensembl_gene_id, padj)

summaryTable <- summaryTable %>%
  left_join(id_table, by="ensembl_gene_id") %>%
  left_join(maxFCs, by="ensembl_gene_id") %>%
  left_join(minPvals, by="ensembl_gene_id") %>%
  dplyr::rename(maxFoldChange = linearFoldChange,
         minFDR_pval = padj
         ) %>%
  mutate(maxFoldChange=abs(maxFoldChange))

numColsToPrepend <- ncol(summaryTable) - 3*length(resList) - 2 # Number of columns per contrast = 3. Subtract two for the baseMean and ensembl_gene_id columns.
colPositionsToPrependSTART <- ncol(summaryTable) - numColsToPrepend + 1
colPositionsOfData <- ncol(summaryTable) - numColsToPrepend
summaryTable <- summaryTable[,c(1,
                                colPositionsToPrependSTART:ncol(summaryTable),
                                2:colPositionsOfData)]

CPMddsDF <- data.frame(ensembl_gene_id = row.names(CPMdds), CPMdds, check.names=F)
CPMddsDF <- dplyr::left_join(CPMddsDF, id_table, by="ensembl_gene_id")
numColsToPrepend <- ncol(CPMddsDF) - ncol(CPMdds) - 1
colPositionsToPrependSTART <- ncol(CPMddsDF) - numColsToPrepend + 1
colPositionsOfData <- ncol(CPMddsDF) - numColsToPrepend
CPMddsDF <- CPMddsDF[,c(1,colPositionsToPrependSTART:ncol(CPMddsDF),2:colPositionsOfData)]

The code above (not shown by default) generates tables summarzing the differentially expressed genes (DEGs).

Here is the number of DEGs in each group:

kable(degTable,
      caption="Number of differentially expressed genes across each contrast") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Number of differentially expressed genes across each contrast
contrast n
1 5adC vs 0 Naive 693
5 5adC vs 0 Naive 1195
1 5adC vs 0 Vehicle 1292
5 5adC vs 0 Vehicle 1786
0 Vehicle vs 0 Naive 212
#######################################
### Write results table from DESeq2
#######################################
setwd(outputdir)
write.table(allResults,         file=paste0(NORM_TYPE,"-DESeq_output_ALL.txt"),               quote=F, sep='\t', col.names=NA)
write.table(significantResults, file=paste0(NORM_TYPE,"-DESeq_output_significant.txt"),       quote=F, sep='\t', col.names=NA)
write.table(summaryTable,       file=paste0(NORM_TYPE,"-DESeq_output_all_genes.txt"),         quote=F, sep='\t', col.names=NA)
write.table(CPMddsDF,           file=paste0(NORM_TYPE,"-Per_sample_CPM.txt"),                 quote=F, sep='\t', col.names=NA)
write.table(Counts,             file=paste0(NORM_TYPE,"-Per_sample_normalized_counts.txt"),   quote=F, sep='\t', col.names=NA)

The code above (not shown by default) writes text files for each DEG summary type.

3 Write data

setwd(outputdir)
#######################################
### Write results above but in Excel
#######################################
### Global options
options("openxlsx.borderColour" = "#4F80BD")
options("openxlsx.borderStyle" = "thin")
options("openxlsx.maxWidth" = 50)
hs1 <- createStyle(textDecoration = "Bold",
                   border = "Bottom",
                   fontColour = "black")
hs2 <- createStyle(textDecoration = "Bold",
                   border = c("top", "bottom", "left", "right"),
                   fontColour = "black",
                   fgFill="#C5D9F1")

### Summary results - one gene per line, columns are contrasts
wb1 <- createWorkbook()
modifyBaseFont(wb1, fontSize = 10, fontName = "Arial Narrow")
addWorksheet(wb1, "DESeq_results_per_gene")
for (j in 1:length(contrastsInSummary)) {
  myStartcol=7+((j-1)*3)
  myEndcol=9+((j-1)*3)
  mergeCells(wb1,
             sheet = 1,
             cols = myStartcol:myEndcol,
             rows = 1)
  writeData(
    wb1,
    sheet = 1,
    x = contrastsInSummary[j],
    startCol = myStartcol,
    startRow = 1)
}
conditionalFormatting(wb1,
                      sheet = 1,
                      rows = 1,
                      cols = 1:ncol(summaryTable),
                      type = "contains",
                      rule = "",
                      style=hs2)
freezePane(wb1, sheet = 1, firstActiveRow = 3, firstActiveCol = 4)
writeDataTable(wb1,
               sheet = 1,
               startRow = 2,
               x = summaryTable,
               colNames = TRUE,
               rowNames = F,
               tableStyle = "none",
               headerStyle = hs1,
               keepNA = T,
               na.string = "NA")
setColWidths(wb1, sheet = 1, cols = 1:6, widths = "auto") # This is hard-coded, so prone to error; will only impact auto adjustment of col widths.
setColWidths(wb1, sheet = 1, cols = 7:ncol(summaryTable), widths = 13) # This is hard-coded, so prone to error; will only impact auto adjustment of col widths.
fname1 <- paste0("1.",NORM_TYPE,"-DESeq_by_gene.xlsx")
saveWorkbook(wb1, fname1, overwrite = TRUE)

### All results in one table
wb2 <- createWorkbook()
modifyBaseFont(wb2, fontSize = 10, fontName = "Arial Narrow")
addWorksheet(wb2, paste0("FDR",pAdjValue,".Linear.FC.",linear_fc_filter))
freezePane(wb2, sheet = 1, firstRow = TRUE, firstActiveCol = 4)
writeDataTable(wb2,
               sheet = 1,
               x = significantResults,
               colNames = TRUE,
               rowNames = F,
               tableStyle = "none",
               headerStyle = hs1,
               keepNA = T,
               na.string = "NA")
setColWidths(wb2, sheet = 1, cols = 1:ncol(significantResults), widths = "auto")
addWorksheet(wb2, "DESeq_all_results")
freezePane(wb2, sheet = 2, firstRow = TRUE, firstActiveCol = 4)
writeDataTable(wb2,
               sheet = 2,
               x = allResults,
               colNames = TRUE,
               rowNames = F,
               tableStyle = "none",
               headerStyle = hs1,
               keepNA = T,
               na.string = "NA")
setColWidths(wb2, sheet = 2, cols = 1:ncol(allResults), widths = "auto")
saveWorkbook(wb2, paste0("2.",NORM_TYPE,"-DESeq_all.xlsx"), overwrite = TRUE)

### All results with different tabs for each contrast
wb3 <- createWorkbook()
modifyBaseFont(wb3, fontSize = 10, fontName = "Arial Narrow")

short_contrast_names <- stringr::str_trunc(short_contrast_names, 30, side="center")

for (i in 1:length(levels(allResults$contrast))) {
  print(i)
  dataToWrite <- allResults[allResults$contrast==levels(allResults$contrast)[i],]
  addWorksheet(wb3, short_contrast_names[i])
  freezePane(wb3, sheet = i, firstRow = TRUE, firstActiveCol = 4)
  writeDataTable(wb3,
                 sheet = i,
                 x = dataToWrite,
                 colNames = TRUE,
                 rowNames = F,
                 tableStyle = "none",
                 headerStyle = hs1,
                 keepNA = T,
                 na.string = "NA")
  setColWidths(wb3, sheet = i, cols = 1:ncol(dataToWrite), widths = "auto")
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
saveWorkbook(wb3, paste0("3.",NORM_TYPE,"-DESeq_by_contrast.xlsx"), overwrite = TRUE)

### CPM
wb4 <- createWorkbook()
modifyBaseFont(wb4, fontSize = 10, fontName = "Arial Narrow")
addWorksheet(wb4, "Counts per million")
freezePane(wb4, sheet = 1, firstRow = TRUE, firstActiveCol = 4)
writeDataTable(wb4,
               sheet = 1,
               x = as.data.frame(CPMddsDF),
               colNames = TRUE,
               rowNames = F,
               tableStyle = "none",
               headerStyle = hs1,
               keepNA = T,
               na.string = "NA")
setColWidths(wb4, sheet = 1, cols = 1:ncol(CPMddsDF), widths = "auto")
saveWorkbook(wb4, paste0("4.",NORM_TYPE,"-CPM.xlsx"), overwrite = TRUE)

The code above (not shown by default) writes Excel workbooks and text files of DEG lists.

These files should be provided to you as a separate zip file.

setwd(outputdir)
xfun::embed_files(list.files(".", "[.](xlsx)$"), name = "RNASeq_Spreadsheets.zip")
setwd(outputdir)
xfun::embed_files(list.files(".", "[.](txt)$"), name = "RNASeq_Text_Files.zip")
setwd(outputdir)
xfun::embed_dir(ODAFdir, name = "RNASeq_R-ODAF_text_files.zip")
# Save RData here if flag is TRUE.
setwd(outputdir)
save.image(file = "Partial_analysis.RData")
# Add a flag to the start of the script with if statements in all code chunks to check if the flag is set.
# If flag is TRUE, run analysis; if flag is FALSE, SKIP analysis and continue here (just to modify plotting output).

4 PCA Plots

4.1 All genes, before filtering

## Perform PCA analysis and make plot
plotPCA(rld, intgroup = intgroup, ntop=nrow(assay(rld)))

## Get percent of variance explained
data_pca <- plotPCA(rld, intgroup = c("dose","group","treatment"), ntop=nrow(assay(rld)), returnData = TRUE)
percentVar <- round(100 * attr(data_pca, "percentVar"))

# PCAplot <- plotPCA(rld, intgroup = c("dose","group","treatment"), ntop=nrow(assay(rld)), returnData = TRUE)
# ggplot(PCAplot, aes(PC1, PC2, color=dose)) + 
#   geom_point(size=1) +
#   xlab(paste0("PC1: ",percentVar[1],"% variance")) +
#   ylab(paste0("PC2: ",percentVar[2],"% variance")) +
#   coord_fixed()

This plot shows the first two principal components that explain the variability in the data using the regularized log count data. If you are unfamiliar with principal component analysis, you might want to check the Wikipedia entry or this interactive explanation. In this case, the first and second principal component explain 60 and 16 percent of the variance respectively.

4.2 DEGs only

rld_degs <- rld[row.names(assay(rld)) %in% significantResults$ensembl_gene_id,]
## Perform PCA analysis and make plot
plotPCA(rld_degs, intgroup = c("dose","group","treatment"), ntop=nrow(rld_degs)) #+ theme(legend.position = "none")

PCAplotDEGs <- plotPCA(rld_degs, intgroup = c("dose","group","treatment"), ntop=nrow(assay(rld_degs)), returnData = TRUE)

# ggplot(PCAplotDEGs, aes(PC1, PC2, color=dose)) + 
#   geom_point(size=1) +
#   xlab(paste0("PC1: ",percentVar[1],"% variance")) +
#   ylab(paste0("PC2: ",percentVar[2],"% variance")) +
#   facet_wrap(~group) +
#   coord_fixed()

## Get percent of variance explained
data_pcaDEGs <- plotPCA(rld_degs, intgroup = intgroup, returnData = TRUE, ntop=nrow(rld_degs))
percentVarDEGs <- round(100 * attr(data_pcaDEGs, "percentVar"))

This plot shows the principal components analysis limited to all DEGs. In this case, the first and second principal component explain 78 and 12 percent of the variance respectively.

4.3 Top 100 most variable genes only

# Run this code only once for both the PCA and clustering analysis
rv = rowVars(assay(rld))
select = order(rv, decreasing = TRUE)[1:nBest]

rld_top <- rld[select,]

## Perform PCA analysis and make plot
plotPCA(rld_top, intgroup = intgroup, ntop=nrow(rld_top))

## Get percent of variance explained
data_pcaTop <- plotPCA(rld_top, intgroup = intgroup, returnData = TRUE, ntop=nrow(rld_top))
percentVarTop <- round(100 * attr(data_pcaTop, "percentVar"))

This plot shows the principal components analysis limited to all DEGs. In this case, the first and second principal component explain 88 and 7 percent of the variance respectively.

5 Sample-to-sample distances

5.1 All genes, before filtering

## Obtain the sample euclidean distances
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
## Add names based on intgroup
rownames(sampleDistMatrix) <- apply(as.data.frame(colData(rld)[, intgroup]), 1,
    paste, collapse = ' : ')
colnames(sampleDistMatrix) <- NULL

## Define colors to use for the heatmap
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)

## Make the heatmap
pheatmap(sampleDistMatrix, clustering_distance_rows = sampleDists,
    clustering_distance_cols = sampleDists, color = colors)

This plot shows how samples are clustered based on their euclidean distance using the regularized log transformed count data. This figure gives an overview of how the samples are hierarchically clustered. It is a complementary figure to the PCA plot.

5.2 DEGs only

## Limit to the top n genes
# rv = rowVars(assay(rld))
# select = order(rv, decreasing = TRUE)[1:nBest]
## Obtain the sample euclidean distances
# sampleDistsTop <- dist(t(assay(rld)[select,]))
sampleDistsDEGs <- dist(t(assay(rld_degs)))
# sampleDistMatrixTop <- as.matrix(sampleDistsTop)
sampleDistMatrixDEGs <- as.matrix(sampleDistsDEGs)

## Add names based on intgroup
rownames(sampleDistMatrixDEGs) <- apply(as.data.frame(colData(rld)[, intgroup]), 1,
    paste, collapse = ' : ')
colnames(sampleDistMatrixDEGs) <- NULL

## Define colors to use for the heatmap
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)

## Make the heatmap
pheatmap(sampleDistMatrixDEGs,
         clustering_distance_rows = sampleDistsDEGs,
         clustering_distance_cols = sampleDistsDEGs,
         color = colors)

This plot shows how samples are clustered based on their euclidean distance using the regularized log transformed count data of all DEGs. This figure gives an overview of how the samples are hierarchically clustered. It is a complementary figure to the PCA plot.

5.3 Most variable genes only

## Limit to the top n genes
## Using the select object from PCA code above...
## Obtain the sample euclidean distances
sampleDistsTop <- dist(t(assay(rld)[select,]))
sampleDistMatrixTop <- as.matrix(sampleDistsTop)

## Add names based on intgroup
rownames(sampleDistMatrixTop) <- apply(as.data.frame(colData(rld)[, intgroup]), 1,
    paste, collapse = ' : ')
colnames(sampleDistMatrixTop) <- NULL

## Define colors to use for the heatmap
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)

## Make the heatmap
pheatmap(sampleDistMatrixTop,
         clustering_distance_rows = sampleDistsTop,
         clustering_distance_cols = sampleDistsTop,
         color = colors)

This plot shows how samples are clustered based on their euclidean distance using the regularized log transformed count data of the top 100 most variable genes. This figure gives an overview of how the samples are hierarchically clustered. It is a complementary figure to the PCA plot.

6 Heatmaps

6.1 All DEGs


mat <- assay(rld)[row.names(assay(rld)) %in% significantResults$ensembl_gene_id,]
mat <- mat - rowMeans(mat)

# quantile_breaks <- function(xs, n = 10) {
#   breaks <- quantile(xs, probs = seq(0, 1, length.out = n))
#   breaks[!duplicated(breaks)]
# }
# 
# mat_breaks <- quantile_breaks(mat, n = 11)
# pheatmap(mat,
#          annotation_col=heatmap_df,
#          show_rownames = FALSE,
#          border_color = NA,
#          scale="row",
#          breaks = mat_breaks)

heatmap_df <- as.data.frame(colData(rld)[,c("dose","group","treatment")]) # Customize!
pheatmap(mat,
         annotation_col=heatmap_df,
         show_rownames = FALSE,
         border_color = NA,
         scale="row")

# color = inferno(10)

chems=unique(DESeqDesign$chemical)
# plot_list=list()
for (chem in chems){

samples_for_heatmap <- DESeqDesign[which(DESeqDesign$chemical %in% c(chem, "DMSO 0.1%")),]$original_names
mat_subset <- mat[,samples_for_heatmap]

genes_for_heatmap_subset <- row.names(mat_subset)
genes_for_heatmap_subset <- data.frame(ensembl_gene_id=row.names(mat_subset)) %>%
  dplyr::left_join(id_table,
                   by="ensembl_gene_id") %>%
  dplyr::distinct()
genes_for_heatmap_subset$external_gene_name[is.na(genes_for_heatmap_subset$external_gene_name)]  <- genes_for_heatmap_subset$ensembl_gene_id[is.na(genes_for_heatmap_subset$external_gene_name)]
genes_for_heatmap$external_gene_name[which(genes_for_heatmap$external_gene_name=="")]  <- genes_for_heatmap$ensembl_gene_id[which(genes_for_heatmap$external_gene_name=="")] 
row.names(genes_for_heatmap_subset) <- genes_for_heatmap_subset$ensembl_gene_id

# pheatmap(mat_subset,
#          #color = rev(brewer.pal(11,"RdBu")), # inferno(10),
#          annotation_col=dplyr::select(heatmap_df,-original_names),
#          labels_row=genes_for_heatmap_subset[,2],
#          border_color = NA,
#          scale = "row",
#          cutree_rows = 3,
#          cutree_cols = 4)

mat_subset_ordered <- mat_subset[,dplyr::arrange(heatmap_df[which(heatmap_df$original_names %in% samples_for_heatmap), ],dose)$original_names]

# No clustering on columns
pheatmap(mat_subset_ordered,
         #color = rev(brewer.pal(11,"RdBu")), # inferno(10),
         annotation_col=dplyr::select(heatmap_df,-original_names),
         cluster_cols = F,
         labels_row=genes_for_heatmap_subset[,2],
         border_color = NA,
         scale = "row",
         cutree_rows = 3,
         cutree_cols = 4)

# x=pheatmap(mat,
#            annotation_col=heatmap_df,
#            show_rownames = FALSE,
#            border_color = NA,
#            scale="row")
#   plot_list[[a]] = x[[4]]     ##to save each plot into a list. note the [[4]]
# }
# do.call(grid.arrange,plot_list)

}

6.2 Top 50 differentially abundant genes

mat_top <- assay(rld)[row.names(assay(rld)) %in% allResultsOrdered_logFC_filter[1:nHeatmapDEGs,]$ensembl_gene_id,]
mat_top <- mat_top - rowMeans(mat_top)

genes_for_heatmap <- row.names(mat_top)
genes_for_heatmap <- data.frame(ensembl_gene_id=row.names(mat_top)) %>%
  dplyr::left_join(id_table,
                   by="ensembl_gene_id") %>%
  dplyr::distinct()
## Warning: Column `ensembl_gene_id` joining factor and character vector, coercing
## into character vector
genes_for_heatmap$external_gene_name[is.na(genes_for_heatmap$external_gene_name)]  <- genes_for_heatmap$ensembl_gene_id[is.na(genes_for_heatmap$external_gene_name)]
genes_for_heatmap$external_gene_name[which(genes_for_heatmap$external_gene_name=="")]  <- genes_for_heatmap$ensembl_gene_id[which(genes_for_heatmap$external_gene_name=="")] 
row.names(genes_for_heatmap) <- genes_for_heatmap$ensembl_gene_id

pheatmap(mat_top,
         annotation_col=heatmap_df,
         labels_row=genes_for_heatmap[,2],
         show_rownames = TRUE,
         border_color = NA,
         scale="row")

# color = inferno(10)

6.3 Top 50 variable genes


# sliderInput("nHeatmapShiny", label = "Bandwidth adjustment:",
#              min = 10, max = 1000, value = 100, step = 10)

# Interactive heatmap

#renderPlot({
rv = rowVars(assay(rld))
select = order(rv, decreasing = TRUE)[1:nHeatmap]
matRV <- assay(rld)[select,]
matRV <- matRV - rowMeans(matRV)
genes_for_heatmap <- row.names(matRV)
genes_for_heatmap <- data.frame(ensembl_gene_id=row.names(matRV)) %>%
  dplyr::left_join(id_table,
                   by="ensembl_gene_id") %>%
  dplyr::distinct()
## Warning: Column `ensembl_gene_id` joining factor and character vector, coercing
## into character vector
genes_for_heatmap$external_gene_name[is.na(genes_for_heatmap$external_gene_name)]  <- genes_for_heatmap$ensembl_gene_id[is.na(genes_for_heatmap$external_gene_name)]  
genes_for_heatmap$external_gene_name[which(genes_for_heatmap$external_gene_name=="")]  <- genes_for_heatmap$ensembl_gene_id[which(genes_for_heatmap$external_gene_name=="")] 
row.names(genes_for_heatmap) <- genes_for_heatmap$ensembl_gene_id

pheatmap(matRV,
         #color = rev(brewer.pal(11,"RdBu")), # inferno(10),
         annotation_col=heatmap_df,
         labels_row=genes_for_heatmap[,2],
         border_color = NA,
         scale = "row",
         cutree_rows = 3,
         cutree_cols = 4)


#})

7 MA plots

This section contains three groups of MA plots (see Wikipedia) that compare the mean of the normalized counts against the log fold change. Each of the groups has a tab for each contrast. The plots show one point per feature. The points are shown in red if the feature has an adjusted p-value less than the cutoff listed in each section, that is, the statistically significant features are shown in red.

7.1 Filtered at 0.1

This first group of plots shows alpha = 0.1, which is the alpha value used to determine which resulting features were significant when running the function DESeq2::results().

## MA plot with alpha used in DESeq2::results()
for (i in 1:length(resList)) {
contrast = gsub(pattern = "log2.*Time\ ",
                replacement =  "",
                x = resList[[i]]@elementMetadata[[2]][2])
cat("###", contrast, " \n")
DESeq2::plotMA(resList[[i]], alpha = metadata(resList[[i]])$alpha, main = paste('MA plot with alpha =',
    metadata(resList[[i]])$alpha,',',contrast))
cat('\n\n')
}

7.1.1 log2 fold change (MMSE): group 1 5adC vs 0 Naive

7.1.2 log2 fold change (MMSE): group 5 5adC vs 0 Naive

7.1.3 log2 fold change (MMSE): group 1 5adC vs 0 Vehicle

7.1.4 log2 fold change (MMSE): group 5 5adC vs 0 Vehicle

7.1.5 log2 fold change (MMSE): group 0 Vehicle vs 0 Naive

7.2 Filtered at 0.05

This second group of MA plots uses alpha = 0.05 and can be used against the first MA plot to identify features which have adjusted p-values between 0.05 and 0.1.

## MA plot with alpha = 1/2 of the alpha used in DESeq2::results()
for (i in 1:length(resList)) {
contrast = gsub(pattern = "log2.*Time\ ",
                replacement =  "",
                x = resList[[i]]@elementMetadata[[2]][2])
cat("###", contrast, " \n")
DESeq2::plotMA(resList[[i]], alpha = metadata(resList[[i]])$alpha / 2,
    main = paste('MA plot with alpha =', metadata(resList[[i]])$alpha / 2,',',contrast))
cat('\n\n')
}

7.2.1 log2 fold change (MMSE): group 1 5adC vs 0 Naive

7.2.2 log2 fold change (MMSE): group 5 5adC vs 0 Naive

7.2.3 log2 fold change (MMSE): group 1 5adC vs 0 Vehicle

7.2.4 log2 fold change (MMSE): group 5 5adC vs 0 Vehicle

7.2.5 log2 fold change (MMSE): group 0 Vehicle vs 0 Naive

7.3 Top 100 features

The third and final set of MA plots uses an alpha such that the top 100 features are shown in the plot.

## MA plot with alpha corresponding to the one that gives the nBest features

for (i in 1:length(resList)) {
nBest.actual <- min(nBest, nrow(head(resList[[i]], n = nBest)))
nBest.alpha <- head( resList[[i]][order(resList[[i]]$pvalue),], n = nBest)$padj[nBest.actual]
contrast = gsub(pattern = "log2.*Time\ ",
                replacement =  "",
                x = resList[[i]]@elementMetadata[[2]][2])
cat("###", contrast, " \n")
DESeq2::plotMA(resList[[i]], alpha = nBest.alpha * 1.00000000000001,
    main = paste('MA plot for top', nBest.actual, 'features',',',contrast))
cat('\n\n')
}

7.3.1 log2 fold change (MMSE): group 1 5adC vs 0 Naive

7.3.2 log2 fold change (MMSE): group 5 5adC vs 0 Naive

7.3.3 log2 fold change (MMSE): group 1 5adC vs 0 Vehicle

7.3.4 log2 fold change (MMSE): group 5 5adC vs 0 Vehicle

7.3.5 log2 fold change (MMSE): group 0 Vehicle vs 0 Naive

8 P-values

8.1 Distribution of all p-values

## P-value histogram plot

ggplot(allResults, aes(x = pvalue)) +
    geom_histogram(alpha=.5, position='identity', bins = 50) +
    labs(title='Histogram of unadjusted p-values') +
    xlab('Unadjusted p-values') +
    facet_wrap( ~ contrast, ncol = 2)

This plot shows a histogram of the unadjusted p-values. It might be skewed right or left, or flat as shown in the Wikipedia examples. The shape depends on the percent of features that are differentially expressed. For further information on how to interpret a histogram of p-values check David Robinson’s post on this topic.

## P-value distribution summary
summary(allResults$pvalue)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0000509 0.0332513 0.2040367 0.3422967 1.0000000

This is the numerical summary of the distribution of the p-values.

## Split features by different p-value cutoffs
pval_table <- lapply(c(1e-04, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 1), function(x) {
    data.frame('Cut' = x, 'Count' = sum(allResults$pvalue <= x, na.rm = TRUE))
})
pval_table <- do.call(rbind, pval_table)
kable(pval_table, format = 'markdown', align = c('c', 'c'))
Cut Count
0.0001 15323
0.0010 19054
0.0100 24615
0.0250 27785
0.0500 30679
0.1000 34225
0.2000 38832
0.3000 42164
0.4000 44937
0.5000 47352
0.6000 49584
0.7000 51721
0.8000 53817
0.9000 55886
1.0000 57855

This table shows the number of features with p-values less or equal than some commonly used cutoff values.

8.2 Distribution of adjusted p-values

## Adjusted p-values histogram plot
ggplot(allResults, aes(x = padj)) +
    geom_histogram(alpha=.5, position='identity', bins = 50) +
    labs(title=paste('Histogram of', elementMetadata(resList[[1]])$description[grep('adjusted', elementMetadata(resList[[1]])$description)])) +
    xlab('Adjusted p-values') +
    xlim(c(0, 1.0005)) +
  facet_wrap( ~ contrast, ncol = 2, scales="free")
## Warning: Removed 224 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing missing values (geom_bar).

This plot shows a histogram of the fdr adjusted p-values. It might be skewed right or left, or flat as shown in the Wikipedia examples.

## Adjusted p-values distribution summary
summary(res.df$padj)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 3.633e-04 3.263e-06 2.440e-02

This is the numerical summary of the distribution of the fdr adjusted p-values.

## Split features by different adjusted p-value cutoffs
padj_table <- lapply(c(1e-04, 0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 1), function(x) {
    data.frame('Cut' = x, 'Count' = sum(res.df$padj <= x, na.rm = TRUE))
})
padj_table <- do.call(rbind, padj_table)
kable(padj_table, format = 'markdown', align = c('c', 'c'))
Cut Count
0.0001 4462
0.0010 4843
0.0100 5144
0.0250 5188
0.0500 5188
0.1000 5188
0.2000 5188
0.3000 5188
0.4000 5188
0.5000 5188
0.6000 5188
0.7000 5188
0.8000 5188
0.9000 5188
1.0000 5188

This table shows the number of features with fdr adjusted p-values less or equal than some commonly used cutoff values.

9 Tables of top features

This table shows the significant DEGs (passing all filtering criteria) ordered by their absolute fold change. Use the search function to find your feature of interest or sort by one of the columns. You can limit to a single contrast if desired.

searchURL <- "http://www.ncbi.nlm.nih.gov/gene/?term="
## Add search url if appropriate
res.df.dt <- res.df
if(!is.null(searchURL)) {
    res.df.dt$ensembl_gene_id <- paste0('<a href="',
                             searchURL,
                             res.df.dt$ensembl_gene_id,
                             '" rel="noopener noreferrer" target="_blank">',
                             res.df.dt$ensembl_gene_id,
                             '<br/>',
                             res.df.dt$external_gene_name,
                             '</a>')
}

for(i in which(colnames(res.df.dt) %in% c('pvalue', 'padj'))) res.df.dt[, i] <- format(res.df.dt[, i], scientific = TRUE)
res.df.dt <- res.df.dt[,!names(res.df.dt) %in% c("description", "external_gene_name")]
DT::datatable(res.df.dt,
          options = list(pagingType='full_numbers',
                         pageLength=20,
                         scrollX='100%',
                         dom = 'Bfrtip',
                         buttons = c('copy', 'csv', 'excel', 'pdf', 'print', 'colvis'),
                         columnDefs = list(list(visible=FALSE, targets=c(2,4,5)))),
          escape = FALSE,
          extensions = 'Buttons',
          rownames = FALSE,
          filter = "top",
          colnames = c('Ensembl' = 'ensembl_gene_id')) %>% 
  DT::formatRound(which(!colnames(res.df.dt) %in% c('pvalue', 'padj', 'Feature', 'contrast', 'description', 'ensembl_gene_id', 'external_gene_name' )), digits)

10 Gene-level plots for top 20 features

This section contains plots showing the normalized counts per sample for each group of interest. Only the best 20 features are shown, ranked by their absolute fold change values. The Y axis is on the log10 scale and the feature name is shown in the title of each plot.

plotCounts_gg <- function(i, dds, intgroup) {
    data <- plotCounts(dds,
                       gene=i,
                       intgroup=intgroup,
                       returnData = TRUE)
    ggplot(data, aes(x = data[,2], y = data[,1])) + 
      geom_point() + 
      ylab('Normalized count') +
      xlab('Group') + 
      ggtitle(paste0(i, " ", id_table$external_gene_name[id_table$ensembl_gene_id == i])) + 
      coord_trans(y = "log10") + 
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

genesToPlot <- significantResults %>% arrange(-abs(log2FoldChange))

for(i in head(unique(genesToPlot$ensembl_gene_id), nBestFeatures)) {
    print(plotCounts_gg(i, dds = dds, intgroup = intgroup))
}

11 Plots of genes of interest

This section shows genes of interest sorted by those with highest fold-change within each contrast.

#numResults=20
numResults <- nBestFeatures

allResultsOrdered_logFC_filter %>%
  group_by(contrast) %>%
  top_n(numResults, wt=abs(log2FoldChange)) %>%
  ungroup() %>%
  mutate(contrast=as.factor(contrast),
         external_gene_name=reorder_within(external_gene_name,log2FoldChange, contrast)) %>%
  ggplot(aes(x=log2FoldChange,
             y=external_gene_name,
             color=contrast,
             size=-log(padj))) +
  geom_point(show.legend = TRUE) +
  facet_wrap(~contrast,
             scales="free_y",
             ncol=4,
             labeller = labeller(contrast = label_wrap_gen(10))) +
  scale_y_reordered()  +
  geom_vline(xintercept=0,
             linetype="dashed",
             color = "black",
             size=1) +
  ggtitle(paste0("Top ",
                 numResults,
                 " genes ranked by fold change (adjusted p-value <",
                 alpha,
                 "), grouped by treatment"))

12 Volcano plot

This section shows a volcano plot for each contrast.
Note that scales are set manually for this plot: therefore, there may be data points outside the range shown (see warnings).

ggplot(allResults, aes(x=log2FoldChange, y=-log10(padj))) +
  geom_point(size=0.1, alpha=0.5) +
  facet_wrap(~contrast, ncol=4) + # , scales="free"
  geom_vline(xintercept=c(-1.5,1.5), color="red", alpha=1.0)+ 
  geom_hline(yintercept=-log10(0.05), color="blue", alpha=1.0) +
  scale_x_continuous(name = "log2 Fold Change", limits = c(-5,5)) + # 
  scale_y_continuous(name = "-log10 adjusted p-value", limits = c(0,6)) # 
## Warning: Removed 9613 rows containing missing values (geom_point).


###########################################################################################
######## PRODUCE INPUT FOR BMDExpress2 ####################################################
###########################################################################################

# # Create input file...
#lognorm.read.counts <- as.data.frame(counts(dds, normalized=TRUE))
# rld normalized, size factor normalized, rounded to 4 significant figures
bmdexpress <- as.data.frame(assay(rld))
bmdexpress <- cbind(SampleID=row.names(bmdexpress), bmdexpress, stringsAsFactors=F)
bmdexpress <- rbind( Dose=c("#CLASS:DOSE",as.numeric(DESeqDesign$dose)), bmdexpress, stringsAsFactors=F)
write.table(bmdexpress, file = "bmdexpress_input.txt", quote = F, sep = "\t", row.names = F, col.names = T)

# chem_subset <- DESeqDesign %>% dplyr::filter(chemical %in% c("D-8","DMSO 0.1%")) %>% dplyr::select(original_names)
# bmdexpresstest <- bmdexpress %>% dplyr::select(chem_subset$original_names)
# bmdexpresstest <- cbind(SampleID=row.names(bmdexpresstest), bmdexpresstest, stringsAsFactors=F)
# bmdexpresstest <- rbind(Chemical=c("#CLASS:CHEMICAL", as.data.frame(DESeqDesign %>% 
#                                                                     dplyr::filter(chemical %in% c("D-8","DMSO 0.1%")))$chemical),
#                                     bmdexpresstest,
#                                     stringsAsFactors=F)
# bmdexpresstest <- rbind( Dose=c("#CLASS:DOSE", as.data.frame(DESeqDesign %>% 
#                                                            dplyr::filter(chemical %in% c("D-8","DMSO 0.1%")))$dose),
#                                 bmdexpresstest,
#                                 stringsAsFactors=F)
# write.table(bmdexpresstest, file = "bmdexpress_test_input.txt", quote = F, sep = "\t", row.names = F, col.names = T)



fastbmd <- as.data.frame(assay(dds))
fastbmd <- cbind(SampleID=row.names(fastbmd), fastbmd, stringsAsFactors=F)
fastbmd <- rbind( Chemical=c("#CLASS:CHEMICAL",as.character(DESeqDesign[DESeqDesign$original_names %in% colnames(fastbmd),]$chemical)),
                  fastbmd,
                  stringsAsFactors=F)
fastbmd <- rbind( Dose=c("#CLASS:DOSE",format(DESeqDesign[DESeqDesign$original_names %in% colnames(fastbmd),]$dose, scientific=F)),
                  fastbmd,
                  stringsAsFactors=F)
write.table(fastbmd, file = "fastbmd_input.txt", quote = F, sep = "\t", row.names = F, col.names = T)

13 GO Enrichment Analysis

This section performs GO enrichment on all DEGs passing filters.

## Remember that dds had ENSEMBL ids for the genes
# ensemblNames <- gsub("\\..*", "", rownames(dds))

ensemblDEGs <- significantResults$ensembl_gene_id
head(ensemblDEGs)
## [1] "ENSMAUG00000000110" "ENSMAUG00000000241" "ENSMAUG00000000263"
## [4] "ENSMAUG00000000269" "ENSMAUG00000000361" "ENSMAUG00000000409"
DEGs <- dplyr::left_join(data.frame(ensembl_gene_id=ensemblDEGs), id_table_entrez, by="ensembl_gene_id")
head(DEGs)
##      ensembl_gene_id external_gene_name
## 1 ENSMAUG00000000110               Fzd6
## 2 ENSMAUG00000000241               Lhx5
## 3 ENSMAUG00000000263              Synpr
## 4 ENSMAUG00000000269               Bnc1
## 5 ENSMAUG00000000361              Timp3
## 6 ENSMAUG00000000409            Ccdc148
##                                                          description
## 1         frizzled class receptor 6 [Source:NCBI gene;Acc:101840460]
## 2                    LIM homeobox 5 [Source:NCBI gene;Acc:101839931]
## 3                      synaptoporin [Source:NCBI gene;Acc:101834378]
## 4                      basonuclin 1 [Source:NCBI gene;Acc:101837263]
## 5 TIMP metallopeptidase inhibitor 3 [Source:NCBI gene;Acc:101831793]
## 6 coiled-coil domain containing 148 [Source:NCBI gene;Acc:101828531]
##   entrezgene_id
## 1     101840460
## 2     101839931
## 3     101834378
## 4     101837263
## 5     101831793
## 6     101828531
entrezDEGs <- as.character(DEGs$entrezgene_id)
entrezDEGs <- entrezDEGs[!is.na(entrezDEGs)]

#### DEFINE INPUTS
myDEGs <- entrezDEGs
# background <- row.names(assay(dds))
background <- dplyr::left_join(data.frame(ensembl_gene_id=row.names(assay(dds))), id_table_entrez, by="ensembl_gene_id") %>%
  dplyr::filter(!is.na(entrezgene_id)) %>%
  dplyr::pull(entrezgene_id)
background <- as.character(background)

entrezDEGs <- entrezDEGs[!is.na(entrezDEGs)]

##### TO DO: Add multiple panels for each contrast.
##### Use "all" argument to only run each once..?

## Not all genes have a p-value
table(!is.na(resList[[1]]$padj))
## 
##  TRUE 
## 11554

KeyType <- "ENTREZID" # ENSEMBL? ACCNUM? GID? ENTREZID?
# head(keys(OrgDb.Ma, keytype="ENTREZID"))

enrich_go_all <- enrichGO(gene = myDEGs,
                         universe = background, # All genes in dataset
                         OrgDb = OrgDb.Ma, # org.Mm.eg.db # org.Hs.eg.db # org.Rn.eg.db # OrgDb.Ma
                         keyType = KeyType,
                         ont = "all",
                         pAdjustMethod = "BH",
                         pvalueCutoff = 0.01,
                         qvalueCutoff = 0.05)

# clusterProfiler::dotplot(enrich_go_all, font.size=9, showCategory=10, split="ONTOLOGY")  +
#   theme(axis.text.y = element_text(angle = 0)) +
#   scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 50)) +
#   facet_wrap(~ONTOLOGY)

## Perform enrichment analysis for Biological Process (BP)
enrich_go_bp <- enrichGO(gene = myDEGs,
                         universe = background, # All genes in dataset
                         OrgDb = OrgDb.Ma, # org.Mm.eg.db # org.Hs.eg.db # org.Rn.eg.db
                         keyType = KeyType,
                         ont = "BP",
                         pAdjustMethod = "BH",
                         pvalueCutoff = 0.01,
                         qvalueCutoff = 0.05)

enrich_go_mf <- enrichGO(gene = myDEGs,
                         universe = background, # All genes in dataset
                         OrgDb = OrgDb.Ma, # org.Mm.eg.db # org.Hs.eg.db # org.Rn.eg.db
                         keyType = KeyType,
                         ont = "MF",
                         pAdjustMethod = "BH",
                         pvalueCutoff = 0.01,
                         qvalueCutoff = 0.05) 

enrich_go_cc <- enrichGO(gene = myDEGs,
                         universe = background, # All genes in dataset
                         OrgDb = OrgDb.Ma, # org.Mm.eg.db # org.Hs.eg.db # org.Rn.eg.db
                         keyType = KeyType,
                         ont = "CC",
                         pAdjustMethod = "BH",
                         pvalueCutoff = 0.01,
                         qvalueCutoff = 0.05)

13.1 Biological processes

clusterProfiler::dotplot(enrich_go_bp, font.size=9, showCategory=20)  +
  theme(axis.text.y = element_text(angle = 0)) +
  scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 30))

13.2 Molecular Functions

clusterProfiler::dotplot(enrich_go_mf, font.size=9)  +
   theme(axis.text.y = element_text(angle = 0)) +
  scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 30))

13.3 Cellular Component

clusterProfiler::dotplot(enrich_go_cc, font.size=9)  +
   theme(axis.text.y = element_text(angle = 0)) +
  scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 30))

14 Session Info

Date the report was generated.

## [1] "2020-10-26 20:37:28 UTC"

Wallclock time spent generating the report.

## Time difference of 5.115 mins

R session information.

## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl' had status 1
## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.6.1 (2019-07-05)
##  os       Debian GNU/Linux 10 (buster)
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       Etc/UTC                     
##  date     2020-10-26                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
##  package                * version  date       lib source                                   
##  affy                     1.62.0   2019-05-02 [1] Bioconductor                             
##  affyio                   1.54.0   2019-05-02 [1] Bioconductor                             
##  annotate                 1.62.0   2019-05-02 [1] Bioconductor                             
##  AnnotationDbi          * 1.46.1   2019-08-20 [1] Bioconductor                             
##  AnnotationHub          * 2.16.1   2019-09-04 [1] Bioconductor                             
##  ashr                     2.2-47   2020-02-20 [1] CRAN (R 3.6.1)                           
##  assertthat               0.2.1    2019-03-21 [2] CRAN (R 3.6.1)                           
##  backports                1.1.9    2020-08-24 [1] CRAN (R 3.6.1)                           
##  base64enc                0.1-3    2015-07-28 [2] CRAN (R 3.6.1)                           
##  bibtex                   0.4.2.2  2020-01-02 [1] CRAN (R 3.6.1)                           
##  Biobase                * 2.44.0   2019-05-02 [1] Bioconductor                             
##  BiocFileCache          * 1.8.0    2019-05-02 [1] Bioconductor                             
##  BiocGenerics           * 0.30.0   2019-05-02 [1] Bioconductor                             
##  BiocManager              1.30.10  2019-11-16 [2] CRAN (R 3.6.1)                           
##  BiocParallel           * 1.18.1   2019-08-06 [1] Bioconductor                             
##  BiocStyle                2.12.0   2019-05-02 [1] Bioconductor                             
##  biomaRt                * 2.40.5   2019-10-01 [1] Bioconductor                             
##  Biostrings               2.52.0   2019-05-02 [1] Bioconductor                             
##  bit                      4.0.4    2020-08-04 [1] CRAN (R 3.6.1)                           
##  bit64                    4.0.2    2020-07-30 [1] CRAN (R 3.6.1)                           
##  bitops                   1.0-6    2013-08-17 [2] CRAN (R 3.6.1)                           
##  blob                     1.2.1    2020-01-20 [1] CRAN (R 3.6.1)                           
##  BSgenome                 1.52.0   2019-05-02 [1] Bioconductor                             
##  bumphunter               1.26.0   2019-05-02 [1] Bioconductor                             
##  caTools                  1.18.0   2020-01-17 [1] CRAN (R 3.6.1)                           
##  checkmate                2.0.0    2020-02-06 [1] CRAN (R 3.6.1)                           
##  cli                      2.0.2    2020-02-28 [1] CRAN (R 3.6.1)                           
##  cluster                  2.1.0    2019-06-19 [2] CRAN (R 3.6.1)                           
##  clusterProfiler        * 3.12.0   2019-05-02 [1] Bioconductor                             
##  codetools                0.2-16   2018-12-24 [2] CRAN (R 3.6.1)                           
##  colorspace               1.4-1    2019-03-18 [2] CRAN (R 3.6.1)                           
##  cowplot                  1.0.0    2019-07-11 [1] CRAN (R 3.6.1)                           
##  crayon                   1.3.4    2017-09-16 [2] CRAN (R 3.6.1)                           
##  crosstalk                1.1.0.1  2020-03-13 [1] CRAN (R 3.6.1)                           
##  curl                     4.3      2019-12-02 [1] CRAN (R 3.6.1)                           
##  data.table             * 1.13.0   2020-07-24 [1] CRAN (R 3.6.1)                           
##  DBI                      1.1.0    2019-12-15 [1] CRAN (R 3.6.1)                           
##  dbplyr                 * 1.4.4    2020-05-27 [1] CRAN (R 3.6.1)                           
##  DEFormats                1.12.0   2019-05-02 [1] Bioconductor                             
##  DelayedArray           * 0.10.0   2019-05-02 [1] Bioconductor                             
##  derfinder                1.18.9   2019-09-20 [1] Bioconductor                             
##  derfinderHelper          1.18.1   2019-05-22 [1] Bioconductor                             
##  DESeq2                 * 1.24.0   2019-05-02 [1] Bioconductor                             
##  digest                   0.6.25   2020-02-23 [1] CRAN (R 3.6.1)                           
##  DO.db                    2.9      2020-04-15 [1] Bioconductor                             
##  doRNG                    1.8.2    2020-01-27 [1] CRAN (R 3.6.1)                           
##  DOSE                     3.10.2   2019-06-24 [1] Bioconductor                             
##  dplyr                  * 0.8.5    2020-03-07 [1] CRAN (R 3.6.1)                           
##  DT                     * 0.15     2020-08-05 [1] CRAN (R 3.6.1)                           
##  edgeR                  * 3.26.8   2019-09-01 [1] Bioconductor                             
##  ellipsis                 0.3.1    2020-05-15 [1] CRAN (R 3.6.1)                           
##  enrichplot             * 1.4.0    2019-05-02 [1] Bioconductor                             
##  europepmc                0.4      2020-05-31 [1] CRAN (R 3.6.1)                           
##  evaluate                 0.14     2019-05-28 [2] CRAN (R 3.6.1)                           
##  fansi                    0.4.1    2020-01-08 [1] CRAN (R 3.6.1)                           
##  farver                   2.0.3    2020-01-16 [1] CRAN (R 3.6.1)                           
##  fastmap                  1.0.1    2019-10-08 [2] CRAN (R 3.6.1)                           
##  fastmatch                1.1-0    2017-01-28 [1] CRAN (R 3.6.1)                           
##  fgsea                    1.10.1   2019-08-21 [1] Bioconductor                             
##  forcats                * 0.5.0    2020-03-01 [1] CRAN (R 3.6.1)                           
##  foreach                  1.5.0    2020-03-30 [1] CRAN (R 3.6.1)                           
##  foreign                  0.8-71   2018-07-20 [2] CRAN (R 3.6.1)                           
##  Formula                  1.2-3    2018-05-03 [1] CRAN (R 3.6.1)                           
##  genefilter               1.66.0   2019-05-02 [1] Bioconductor                             
##  geneplotter              1.62.0   2019-05-02 [1] Bioconductor                             
##  generics                 0.0.2    2018-11-29 [1] CRAN (R 3.6.1)                           
##  GenomeInfoDb           * 1.20.0   2019-05-02 [1] Bioconductor                             
##  GenomeInfoDbData         1.2.1    2020-05-25 [1] Bioconductor                             
##  GenomicAlignments        1.20.1   2019-06-18 [1] Bioconductor                             
##  GenomicFeatures          1.36.4   2019-07-10 [1] Bioconductor                             
##  GenomicFiles             1.20.0   2019-05-02 [1] Bioconductor                             
##  GenomicRanges          * 1.36.1   2019-09-06 [1] Bioconductor                             
##  ggforce                  0.3.2    2020-06-23 [1] CRAN (R 3.6.1)                           
##  ggplot2                * 3.3.2    2020-06-19 [1] CRAN (R 3.6.1)                           
##  ggplotify                0.0.5    2020-03-12 [1] CRAN (R 3.6.1)                           
##  ggraph                   2.0.3    2020-05-20 [1] CRAN (R 3.6.1)                           
##  ggrepel                  0.8.2    2020-03-08 [1] CRAN (R 3.6.1)                           
##  ggridges                 0.5.2    2020-01-12 [1] CRAN (R 3.6.1)                           
##  glue                     1.4.1    2020-05-13 [1] CRAN (R 3.6.1)                           
##  GO.db                    3.8.2    2020-05-25 [1] Bioconductor                             
##  GOSemSim                 2.10.0   2019-05-02 [1] Bioconductor                             
##  graphlayouts             0.7.0    2020-04-25 [1] CRAN (R 3.6.1)                           
##  gridExtra                2.3      2017-09-09 [2] CRAN (R 3.6.1)                           
##  gridGraphics             0.5-0    2020-02-25 [1] CRAN (R 3.6.1)                           
##  gtable                   0.3.0    2019-03-25 [2] CRAN (R 3.6.1)                           
##  highr                    0.8      2019-03-20 [2] CRAN (R 3.6.1)                           
##  Hmisc                    4.4-1    2020-08-10 [1] CRAN (R 3.6.1)                           
##  hms                      0.5.3    2020-01-08 [1] CRAN (R 3.6.1)                           
##  htmlTable                2.0.1    2020-07-05 [1] CRAN (R 3.6.1)                           
##  htmltools                0.5.0    2020-06-16 [1] CRAN (R 3.6.1)                           
##  htmlwidgets              1.5.1    2019-10-08 [2] CRAN (R 3.6.1)                           
##  httpuv                   1.5.2    2019-09-11 [2] CRAN (R 3.6.1)                           
##  httr                     1.4.1    2019-08-05 [2] CRAN (R 3.6.1)                           
##  igraph                   1.2.5    2020-03-19 [1] CRAN (R 3.6.1)                           
##  interactiveDisplayBase   1.22.0   2019-05-02 [1] Bioconductor                             
##  invgamma                 1.1      2017-05-07 [1] CRAN (R 3.6.1)                           
##  IRanges                * 2.18.3   2019-09-24 [1] Bioconductor                             
##  irlba                    2.3.3    2019-02-05 [1] CRAN (R 3.6.1)                           
##  iterators                1.0.12   2019-07-26 [1] CRAN (R 3.6.1)                           
##  janeaustenr              0.1.5    2017-06-10 [1] CRAN (R 3.6.1)                           
##  jpeg                     0.1-8.1  2019-10-24 [1] CRAN (R 3.6.1)                           
##  jsonlite                 1.6.1    2020-02-02 [1] CRAN (R 3.6.1)                           
##  kableExtra             * 1.1.0    2019-03-16 [1] CRAN (R 3.6.1)                           
##  knitcitations            1.0.10   2019-09-15 [1] CRAN (R 3.6.1)                           
##  knitr                  * 1.29     2020-06-23 [1] CRAN (R 3.6.1)                           
##  knitrBootstrap           1.0.2    2020-06-02 [1] Github (jimhester/knitrBootstrap@0f8d612)
##  labeling                 0.3      2014-08-23 [2] CRAN (R 3.6.1)                           
##  later                    1.0.0    2019-10-04 [2] CRAN (R 3.6.1)                           
##  lattice                * 0.20-38  2018-11-04 [2] CRAN (R 3.6.1)                           
##  latticeExtra             0.6-29   2019-12-19 [1] CRAN (R 3.6.1)                           
##  lazyeval                 0.2.2    2019-03-15 [2] CRAN (R 3.6.1)                           
##  lifecycle                0.2.0    2020-03-06 [1] CRAN (R 3.6.1)                           
##  limma                  * 3.40.6   2019-07-26 [1] Bioconductor                             
##  locfit                   1.5-9.4  2020-03-25 [1] CRAN (R 3.6.1)                           
##  lubridate                1.7.9    2020-06-08 [1] CRAN (R 3.6.1)                           
##  magrittr               * 1.5      2014-11-22 [2] CRAN (R 3.6.1)                           
##  markdown                 1.1      2019-08-07 [2] CRAN (R 3.6.1)                           
##  MASS                     7.3-51.4 2019-03-31 [2] CRAN (R 3.6.1)                           
##  Matrix                   1.2-17   2019-03-22 [2] CRAN (R 3.6.1)                           
##  matrixStats            * 0.56.0   2020-03-13 [1] CRAN (R 3.6.1)                           
##  memoise                  1.1.0    2017-04-21 [2] CRAN (R 3.6.1)                           
##  mime                     0.9      2020-02-04 [1] CRAN (R 3.6.1)                           
##  mixsqp                   0.3-43   2020-05-14 [1] CRAN (R 3.6.1)                           
##  munsell                  0.5.0    2018-06-12 [2] CRAN (R 3.6.1)                           
##  nnet                     7.3-12   2016-02-02 [2] CRAN (R 3.6.1)                           
##  openxlsx               * 4.1.5    2020-05-06 [1] CRAN (R 3.6.1)                           
##  pheatmap               * 1.0.12   2019-01-04 [1] CRAN (R 3.6.1)                           
##  pillar                   1.4.6    2020-07-10 [1] CRAN (R 3.6.1)                           
##  pkgconfig                2.0.3    2019-09-22 [2] CRAN (R 3.6.1)                           
##  plotly                 * 4.9.2.1  2020-04-04 [1] CRAN (R 3.6.1)                           
##  plyr                     1.8.6    2020-03-03 [1] CRAN (R 3.6.1)                           
##  png                      0.1-7    2013-12-03 [2] CRAN (R 3.6.1)                           
##  polyclip                 1.10-0   2019-03-14 [1] CRAN (R 3.6.1)                           
##  preprocessCore           1.46.0   2019-05-02 [1] Bioconductor                             
##  prettyunits              1.1.1    2020-01-24 [1] CRAN (R 3.6.1)                           
##  progress                 1.2.2    2019-05-16 [1] CRAN (R 3.6.1)                           
##  promises                 1.1.0    2019-10-04 [2] CRAN (R 3.6.1)                           
##  purrr                    0.3.4    2020-04-17 [1] CRAN (R 3.6.1)                           
##  qvalue                   2.16.0   2019-05-02 [1] Bioconductor                             
##  R6                       2.4.1    2019-11-12 [2] CRAN (R 3.6.1)                           
##  rappdirs                 0.3.1    2016-03-28 [2] CRAN (R 3.6.1)                           
##  RColorBrewer           * 1.1-2    2014-12-07 [2] CRAN (R 3.6.1)                           
##  Rcpp                     1.0.5    2020-07-06 [1] CRAN (R 3.6.1)                           
##  RCurl                    1.98-1.2 2020-04-18 [1] CRAN (R 3.6.1)                           
##  readr                    1.3.1    2018-12-21 [1] CRAN (R 3.6.1)                           
##  RefManageR               1.2.12   2019-04-03 [1] CRAN (R 3.6.1)                           
##  regionReport           * 1.18.2   2019-06-18 [1] Bioconductor                             
##  reshape2                 1.4.4    2020-04-09 [1] CRAN (R 3.6.1)                           
##  RJSONIO                  1.3-1.4  2020-01-15 [1] CRAN (R 3.6.1)                           
##  rlang                    0.4.6    2020-05-02 [1] CRAN (R 3.6.1)                           
##  RMariaDB               * 1.0.10   2020-08-27 [1] CRAN (R 3.6.1)                           
##  rmarkdown                2.3      2020-06-18 [1] CRAN (R 3.6.1)                           
##  rngtools                 1.5      2020-01-23 [1] CRAN (R 3.6.1)                           
##  rpart                    4.1-15   2019-04-12 [2] CRAN (R 3.6.1)                           
##  Rsamtools                2.0.3    2019-10-10 [1] Bioconductor                             
##  RSQLite                  2.2.0    2020-01-07 [1] CRAN (R 3.6.1)                           
##  rstudioapi               0.11     2020-02-07 [1] CRAN (R 3.6.1)                           
##  rtracklayer              1.44.4   2019-09-06 [1] Bioconductor                             
##  rvcheck                  0.1.8    2020-03-01 [1] CRAN (R 3.6.1)                           
##  rvest                    0.3.5    2019-11-08 [2] CRAN (R 3.6.1)                           
##  rWikiPathways          * 1.4.1    2019-07-30 [1] Bioconductor                             
##  S4Vectors              * 0.22.1   2019-09-09 [1] Bioconductor                             
##  scales                   1.1.1    2020-05-11 [1] CRAN (R 3.6.1)                           
##  sessioninfo            * 1.1.1    2018-11-05 [2] CRAN (R 3.6.1)                           
##  shiny                    1.5.0    2020-06-23 [1] CRAN (R 3.6.1)                           
##  SnowballC                0.7.0    2020-04-01 [1] CRAN (R 3.6.1)                           
##  SQUAREM                  2020.4   2020-08-26 [1] CRAN (R 3.6.1)                           
##  stringi                  1.4.6    2020-02-17 [1] CRAN (R 3.6.1)                           
##  stringr                  1.4.0    2019-02-10 [2] CRAN (R 3.6.1)                           
##  SummarizedExperiment   * 1.14.1   2019-07-31 [1] Bioconductor                             
##  survival                 3.2-3    2020-06-13 [1] CRAN (R 3.6.1)                           
##  tibble                   3.0.3    2020-07-10 [1] CRAN (R 3.6.1)                           
##  tidygraph                1.2.0    2020-05-12 [1] CRAN (R 3.6.1)                           
##  tidyr                    1.1.2    2020-08-27 [1] CRAN (R 3.6.1)                           
##  tidyselect               1.1.0    2020-05-11 [1] CRAN (R 3.6.1)                           
##  tidytext               * 0.2.5    2020-07-11 [1] CRAN (R 3.6.1)                           
##  tokenizers               0.2.1    2018-03-29 [1] CRAN (R 3.6.1)                           
##  triebeard                0.3.0    2016-08-04 [1] CRAN (R 3.6.1)                           
##  truncnorm                1.0-8    2018-02-27 [1] CRAN (R 3.6.1)                           
##  tweenr                   1.0.1    2018-12-14 [1] CRAN (R 3.6.1)                           
##  UpSetR                   1.4.0    2019-05-22 [1] CRAN (R 3.6.1)                           
##  urltools                 1.7.3    2019-04-14 [1] CRAN (R 3.6.1)                           
##  VariantAnnotation        1.30.1   2019-05-19 [1] Bioconductor                             
##  vctrs                    0.3.0    2020-05-11 [1] CRAN (R 3.6.1)                           
##  viridis                * 0.5.1    2018-03-29 [2] CRAN (R 3.6.1)                           
##  viridisLite            * 0.3.0    2018-02-01 [2] CRAN (R 3.6.1)                           
##  vsn                    * 3.52.0   2019-05-02 [1] Bioconductor                             
##  webshot                  0.5.2    2019-11-22 [1] CRAN (R 3.6.1)                           
##  withr                    2.2.0    2020-04-20 [1] CRAN (R 3.6.1)                           
##  xfun                     0.16     2020-07-24 [1] CRAN (R 3.6.1)                           
##  XML                      3.99-0.3 2020-01-20 [1] CRAN (R 3.6.1)                           
##  xml2                     1.3.2    2020-04-23 [1] CRAN (R 3.6.1)                           
##  xtable                   1.8-4    2019-04-21 [2] CRAN (R 3.6.1)                           
##  XVector                  0.24.0   2019-05-02 [1] Bioconductor                             
##  yaml                     2.2.1    2020-02-01 [1] CRAN (R 3.6.1)                           
##  zip                      2.1.0    2020-08-10 [1] CRAN (R 3.6.1)                           
##  zlibbioc                 1.30.0   2019-05-02 [1] Bioconductor                             
## 
## [1] /home/mpiage/R/x86_64-pc-linux-gnu-library/3.6
## [2] /modules/software/rlang/3.6.1/lib/R/library

Pandoc version used: 2.3.1.

# Save complete workspace
setwd(outputdir)
save.image(file = "Complete_analysis.RData")
# Add a flag to the start of the script with if statements in all code chunks to check if the flag is set.
# If flag is TRUE, run analysis; if flag is FALSE, SKIP analysis and continue here (just to modify plotting output).